This document provides an analysis using a one-compartment toxicokinetic model. The dataset used comes from Gammarus pulex exposed to Malathion, as reported by Ashauer (2010). The analysis explores three perspectives: using predefined parameters, performing maximum likelihood estimation (MLE), and incorporating Bayesian approaches to account for parameter uncertainty.
Note that the `sampling_date` column does not directly correspond to the replicate in the dataset used by Sandrine. We load the dataset containing measurements of internal concentrations in Gammarus pulex over time and visualize it to understand the time course of internal concentrations. Unfortunately, we do not know which points belong to the same replicate.
## Load the Malathion dataset (comma-separated values with a header row).
df <- read.table(file = "data/data_Malathion_Ashauer_2010.csv",
                 sep = ",", header = TRUE)
## Treat sampling_date as a categorical label (it is not a replicate ID).
df$sampling_date <- factor(df$sampling_date)
## Time course of the measured internal concentrations; the dashed line marks
## the end of the exposure phase (t = 1 day).
ggplot(df, aes(x = time, y = Cinternal)) +
  geom_point(size = 5, alpha = 0.7) +
  labs(x = "Time (days)",
       y = "Internal measured concentration (picomol/g wet weight)") +
  geom_vline(xintercept = 1, linetype = "dashed")
We also define the limit of quantification (LOQ).
## Limit of detection nmol/kg(wet weight)
## from Table 5 in SI of Ashauer et al. (2010)
## NOTE(review): this value is used below as the limit of QUANTIFICATION
## (LOQ, zero observations are replaced by LOQ/2), while the cited table
## reports a limit of DETECTION -- confirm the two are interchangeable here.
LOQ <- 6
We define a deterministic toxicokinetic model with two parameters:
uptake rate (\(k_u\)) and elimination
rate (\(k_e\)). This model predicts the
internal concentration based on the exposure concentration in water and
the simulation times. Note that the implementation assumes that
tsim is ordered.
## One-compartment toxicokinetic model (analytical solution).
##
## parameters: numeric vector, element 1 = uptake rate k_u,
##             element 2 = elimination rate k_e.
## C_water:    exposure concentration in water (constant while t <= tc).
## tc:         time at which exposure ends.
## tsim:       simulation times, assumed sorted in increasing order.
##
## Returns a data frame with columns `time` and `conc` (predicted internal
## concentration), accumulation-phase times first, then depuration times.
bioacc <- function(parameters, C_water, tc, tsim){
  uptake <- parameters[1]
  elim <- parameters[2]
  ## Partition the simulation times at the end of exposure.
  accumulating <- tsim <= tc
  t.acc <- tsim[accumulating]
  t.dep <- tsim[!accumulating]
  ## Accumulation: solution of dC/dt = k_u * C_water - k_e * C with C(0) = 0.
  conc.acc <- uptake * C_water * (1 - exp(-elim * t.acc)) / elim
  ## Depuration: pure elimination starting from the concentration at t = tc.
  conc.dep <- uptake * C_water *
    (exp(elim * (tc - t.dep)) - exp(-elim * t.dep)) / elim
  data.frame(time = c(t.acc, t.dep),
             conc = c(conc.acc, conc.dep))
}
## Constants shared by all model runs below.
## NOTE(review): assumes Cwater is constant across all rows of df, so that
## unique() returns a single value -- verify against the data file.
C_water <- unique(df$Cwater) # exposure concentration
tc <- 1 # time until exposure ends [days]
This function computes the 5%, 50%, and 95% quantiles (i.e., the median and a 90% uncertainty band) for each point in time from Monte Carlo simulations of the model output.
## Summarise Monte Carlo model outputs per time point.
##
## X:    matrix of simulations, one row per draw, one column per time point.
## tsim: simulation times matching the columns of X.
##
## Returns a data frame with the 5%, 50% and 95% quantiles of each column
## (columns p05, p50, p95) plus the corresponding time.
compute.intervals <- function(X, tsim){
  qs <- apply(X, 2, quantile, probs = c(0.05, 0.5, 0.95))
  data.frame(p05 = qs[1, ], p50 = qs[2, ], p95 = qs[3, ], time = tsim)
}
Using predefined parameters based on Hendriks et al. (2001), we run the model to predict internal concentrations and compare them to the observed data. We also sample parameter uncertainty and plot the 90% prediction interval.
## Literature-based parameters from Hendriks et al. (2001).
parameters.P1 <- c(k.u = 112.798, k.e = 0.00296)
## Simulation time grid [days]. Fix: spell out `length.out` -- the original
## `length=` relied on R's partial argument matching.
tsim <- seq(0, 7.5, length.out = 111)
## Deterministic model prediction with the literature parameters.
simu.mean <- bioacc(parameters.P1, C_water, tc, tsim)
## Compare the deterministic prediction to the observations.
## Fix: the time axis is in days (cf. the data plot and tc above), not hours.
ggplot(simu.mean, aes(time, conc)) +
  geom_line(col = "orange", linewidth = 1.5) +
  xlab("Time (days)") +
  ylab("Internal predicted concentrations") +
  geom_vline(xintercept = 1, linetype = "dashed") +
  geom_point(data = df, aes(time, Cinternal)) +
  ggtitle("Perspective 1")
## Sample parameter sets from log-normal uncertainty distributions centred on
## the literature values (factor-of-10 spread) and run the model for each.
n <- 10000
## Hoisted out of the loop (these are constants; recomputing them 10000 times
## was wasted work). Also, log10(x)/log10(exp(1)) is simply log(x).
meanlog.k.u <- log(parameters.P1[["k.u"]])
meanlog.k.e <- log(parameters.P1[["k.e"]])
sdlog <- log(10)
X.para <- matrix(NA_real_, ncol = length(tsim), nrow = n)
for (i in seq_len(n)) {
  para <- c(rlnorm(1, meanlog.k.u, sdlog),
            rlnorm(1, meanlog.k.e, sdlog))
  X.para[i, ] <- bioacc(para, C_water, tc, tsim)$conc
}
intervals.para <- compute.intervals(X.para, tsim)
## 90% band from parameter uncertainty only (5%-95% quantiles).
## Fix: the time axis is in days, not hours.
ggplot(intervals.para) +
  geom_line(aes(x = time, y = p50), col = "orange", linewidth = 1.5) +
  geom_ribbon(aes(x = time, y = p50, ymin = p05, ymax = p95), alpha = 0.2) +
  xlab("Time (days)") +
  ylab("Internal concentration") +
  geom_vline(xintercept = 1, linetype = "dashed") +
  geom_point(data = df, aes(time, Cinternal)) +
  ggtitle("Perspective 1 - 90% parameter uncertainty interval")
We estimate the parameters by maximizing the likelihood of the observed data, assuming normal- and gamma-distributed errors. We then fit the model using these MLE parameters.
## Log-likelihood for normally distributed residuals.
##
## theta: c(k.u, k.e, log.sd); the residual SD is exp(theta[3]) so the
##        optimiser can search on an unconstrained scale.
## data:  data frame with columns time, Cwater, Cinternal; must be sorted by
##        time because bioacc() assumes ordered simulation times.
## LOQ:   limit of quantification; zero observations are replaced by LOQ/2.
## tc:    end of exposure [days] -- generalized into an argument; the default
##        preserves the previous hard-coded behaviour.
log.ll.normal <- function(theta, data, LOQ = 6, tc = 1) {
  conc.pred <- bioacc(theta, unique(data$Cwater), tc = tc, tsim = data$time)$conc
  obs <- ifelse(data$Cinternal == 0, LOQ/2, data$Cinternal)
  sum(dnorm(obs, mean = conc.pred, sd = exp(theta[3]), log = TRUE))
}
## Fit the normal-error model by maximum likelihood (Nelder-Mead).
## Fix: fit to the full dataset `df` -- the original passed `data.boot`,
## a variable that is only created later, in the bootstrap section.
mle <- maxLik::maxLik(log.ll.normal,
                      start = c(k.u = 112, k.e = 0.003, log.sd = log(1)),
                      data = df, LOQ = LOQ,
                      method = "NM")
summary(mle)
## --------------------------------------------
## Maximum Likelihood estimation
## Nelder-Mead maximization, 124 iterations
## Return code 0: successful convergence
## Log-Likelihood: -309.6196
## 3 free parameters
## Estimates:
## Estimate Std. error t value Pr(> t)
## k.u 112.94132 2.91237 38.78 <2e-16 ***
## k.e 1.14437 0.08087 14.15 <2e-16 ***
## log.sd 3.00441 0.08452 35.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## --------------------------------------------
## Extract the ML estimates via the standard coef() accessor.
parameters.P2 <- coef(mle)
simu.mean <- bioacc(parameters.P2, C_water, tc, tsim)
## Fix: the time axis is in days, not hours.
ggplot(simu.mean, aes(time, conc)) +
  geom_line(col = "orange", linewidth = 1.5) +
  xlab("Time (days)") +
  ylab("Internal predicted concentrations") +
  geom_vline(xintercept = 1, linetype = "dashed") +
  geom_point(data = df, aes(time, Cinternal)) +
  ggtitle("MLE fit for normal errors")
We repeat the above process for gamma-distributed errors.
## Log-likelihood for gamma-distributed residuals, parameterised by the mode.
##
## The model prediction is taken as the mode of the gamma distribution and
## exp(theta[3]) as its standard deviation. shape/scale follow from
##   mode = (shape - 1) * scale  and  var = shape * scale^2,
## which gives scale as the positive root of scale^2 + mode*scale - sd^2 = 0.
## tc: end of exposure [days] -- generalized into an argument; the default
##     preserves the previous hard-coded behaviour.
log.ll.gamma <- function(theta, data, LOQ = 6, tc = 1) {
  mode <- bioacc(theta, unique(data$Cwater), tc = tc, tsim = data$time)$conc
  sd <- exp(theta[3])
  scale <- 0.5*(sqrt(4*sd^2 + mode^2) - mode)
  shape <- mode/scale + 1
  ## Zero observations are below the LOQ; substitute LOQ/2.
  obs <- ifelse(data$Cinternal == 0, LOQ/2, data$Cinternal)
  sum(dgamma(obs, shape = shape, scale = scale, log = TRUE))
}
## Fit the gamma-error model by maximum likelihood (Nelder-Mead).
## Fix: fit to the full dataset `df` -- the original passed `data.boot`,
## a variable that is only created later, in the bootstrap section.
mle.g <- maxLik::maxLik(log.ll.gamma,
                        start = c(k.u = 112, k.e = 0.003, log.sd = log(1)),
                        data = df, LOQ = LOQ,
                        method = "NM")
summary(mle.g)
## --------------------------------------------
## Maximum Likelihood estimation
## Nelder-Mead maximization, 170 iterations
## Return code 0: successful convergence
## Log-Likelihood: -300.9527
## 3 free parameters
## Estimates:
## Estimate Std. error t value Pr(> t)
## k.u 109.41929 2.07022 52.85 <2e-16 ***
## k.e 1.23121 0.07784 15.82 <2e-16 ***
## log.sd 2.97415 0.09497 31.32 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## --------------------------------------------
## Fix: use the gamma-model fit `mle.g` -- the original copied the estimates
## from the normal-model fit `mle`, so the "gamma" plot showed the wrong fit.
parameters.P2.g <- coef(mle.g)
simu.mean.g <- bioacc(parameters.P2.g, C_water, tc, tsim)
## Fix: the time axis is in days, not hours.
ggplot(simu.mean.g, aes(time, conc)) +
  geom_line(col = "orange", linewidth = 1.5) +
  xlab("Time (days)") +
  ylab("Internal predicted concentrations") +
  geom_vline(xintercept = 1, linetype = "dashed") +
  geom_point(data = df, aes(time, Cinternal)) +
  ggtitle("MLE fit for gamma errors")
Maximum likelihood estimation computes confidence intervals for the parameters based on large-sample-size approximations. Bootstrapping avoids these approximations and also offers a straightforward way to propagate parameter uncertainty through the model to compute confidence intervals for predictions.
# Non-parametric bootstrap: resample rows of df with replacement, refit the
# normal-error model, and keep the parameter estimates of converged fits.
M <- 3000
thetas.boot <- matrix(NA_real_, ncol = 3, nrow = M)
colnames(thetas.boot) <- c("k.u", "k.e", "log.sd")
for (i in seq_len(M)) {
  idx <- sample(seq_len(nrow(df)), nrow(df), replace = TRUE)
  data.boot <- df[idx, ]
  ## bioacc() assumes ordered times, so sort the resampled data.
  data.boot <- data.boot[order(data.boot$time), ]
  fit <- maxLik::maxNM(log.ll.normal,
                       start = parameters.P2,
                       data = data.boot, LOQ = LOQ,
                       parscale = c(100, 1, 5))
  if (fit$code <= 2) { # keep only converged fits; failed fits stay NA
    thetas.boot[i, ] <- coef(fit)
  }
}
pairs(thetas.boot, col = gray(0, alpha = 0.1))
## Propagate the bootstrap parameters through the model.
## Fix: skip replicates that did not converge (NA rows in thetas.boot) and
## drop them before computing quantiles -- quantile() errors on NA input.
X.para <- X.total <- matrix(NA_real_, ncol = length(tsim), nrow = M)
for (i in seq_len(M)) {
  para <- thetas.boot[i, ]
  if (anyNA(para)) next
  X.para[i, ] <- bioacc(para, C_water, tc, tsim)$conc
  sd <- exp(para[3])
  ## Total uncertainty: add the residual (observation) noise.
  X.total[i, ] <- rnorm(length(tsim), mean = X.para[i, ], sd = sd)
}
keep <- complete.cases(X.para)
intervals.para <- compute.intervals(X.para[keep, , drop = FALSE], tsim)
intervals.total <- compute.intervals(X.total[keep, , drop = FALSE], tsim)
## Fix: the time axis is in days, not hours.
ggplot() +
  geom_ribbon(data = intervals.total, aes(x = time, y = p50, ymin = p05, ymax = p95), fill = gray(0.75)) +
  geom_ribbon(data = intervals.para, aes(x = time, y = p50, ymin = p05, ymax = p95), fill = gray(0.6)) +
  geom_line(data = simu.mean, aes(time, conc), col = "orange", linewidth = 1.5) +
  xlab("Time (days)") +
  ylab("Internal concentration") +
  geom_vline(xintercept = 1, linetype = "dashed") +
  geom_point(data = df, aes(time, Cinternal)) +
  ggtitle("Bootstrapped 90% confidence and prediction intervals with normal errors")
Again, we repeat the same procedure for the likelihood function based on the gamma distribution.
# Non-parametric bootstrap for the gamma-error model (same scheme as above).
M <- 3000
thetas.boot.g <- matrix(NA_real_, ncol = 3, nrow = M)
colnames(thetas.boot.g) <- c("k.u", "k.e", "log.sd")
for (i in seq_len(M)) {
  idx <- sample(seq_len(nrow(df)), nrow(df), replace = TRUE)
  data.boot <- df[idx, ]
  ## bioacc() assumes ordered times, so sort the resampled data.
  data.boot <- data.boot[order(data.boot$time), ]
  fit <- maxLik::maxNM(log.ll.gamma,
                       start = parameters.P2.g,
                       data = data.boot, LOQ = LOQ,
                       parscale = c(100, 1, 5))
  if (fit$code <= 2) { # keep only converged fits; failed fits stay NA
    thetas.boot.g[i, ] <- coef(fit)
  }
}
pairs(thetas.boot.g, col = gray(0, alpha = 0.1))
## Propagate the bootstrap parameters through the model (gamma errors).
## Fix: skip non-converged replicates (NA rows) and drop them before the
## quantile computation -- quantile() errors on NA input.
X.para.g <- X.total.g <- matrix(NA_real_, ncol = length(tsim), nrow = M)
for (i in seq_len(M)) {
  para <- thetas.boot.g[i, ]
  if (anyNA(para)) next
  X.para.g[i, ] <- mode <- bioacc(para, C_water, tc, tsim)$conc
  sd <- exp(para[3])
  ## Gamma with given mode and sd: mode = (shape-1)*scale, var = shape*scale^2.
  scale <- 0.5*(sqrt(4*sd^2 + mode^2) - mode)
  shape <- mode/scale + 1
  X.total.g[i, ] <- rgamma(length(tsim), scale = scale, shape = shape)
}
keep <- complete.cases(X.para.g)
intervals.para.g <- compute.intervals(X.para.g[keep, , drop = FALSE], tsim)
intervals.total.g <- compute.intervals(X.total.g[keep, , drop = FALSE], tsim)
## Fix: the time axis is in days, not hours.
ggplot() +
  geom_ribbon(data = intervals.total.g, aes(x = time, y = p50, ymin = p05, ymax = p95), fill = gray(0.75)) +
  geom_ribbon(data = intervals.para.g, aes(x = time, y = p50, ymin = p05, ymax = p95), fill = gray(0.6)) +
  geom_line(data = simu.mean.g, aes(time, conc), col = "orange", linewidth = 1.5) +
  xlab("Time (days)") +
  ylab("Internal concentration") +
  geom_vline(xintercept = 1, linetype = "dashed") +
  geom_point(data = df, aes(time, Cinternal)) +
  ggtitle("Bootstrapped 90% confidence and prediction intervals with gamma errors")
We incorporate prior knowledge into the model using a Bayesian approach, which allows us to sample from the posterior distribution of the parameters and estimate the uncertainty in our predictions.
As in Perspective 1, the priors for the parameters are based on Hendriks et al. (2001).
## Log prior density for theta = c(k.u, k.e, log.sd).
## k.u and k.e get log-normal priors centred on the Hendriks et al. (2001)
## values with a factor-of-10 spread (sdlog = log(10)); log.sd gets a wide
## normal prior. Simplification: log10(x)/log10(exp(1)) is just log(x).
log.prior <- function(theta) {
  k.u <- 112.798
  k.e <- 0.00296
  dlnorm(theta[1], log(k.u), log(10), log = TRUE) +
    dlnorm(theta[2], log(k.e), log(10), log = TRUE) +
    dnorm(theta[3], 2, 2, log = TRUE)
}
We define functions proportional to the posterior distribution, incorporating both likelihood and prior information.
## Unnormalised log posterior for the normal-error model: log prior plus
## log-likelihood; extra arguments (e.g. LOQ) are forwarded to the likelihood.
log.post.normal <- function(theta, data, ...) {
  log.prior(theta) + log.ll.normal(theta, data, ...)
}
## Unnormalised log posterior for the gamma-error model: log prior plus
## log-likelihood; extra arguments (e.g. LOQ) are forwarded to the likelihood.
log.post.gamma <- function(theta, data, ...) {
  log.prior(theta) + log.ll.gamma(theta, data, ...)
}
We sample from the posterior distribution using a Markov chain Monte Carlo (MCMC) method. This helps us quantify the uncertainty in the model predictions.
## Sample the normal-error posterior with an adaptive Metropolis sampler.
## NOTE(review): MCMC() and convert.to.coda() appear to come from the
## adaptMCMC package -- confirm; `scale` would then be the initial proposal
## SDs and `acc.rate` the target acceptance rate of the adaptation.
## `data` and `LOQ` are forwarded to log.post.normal via `...`.
pp.normal <- MCMC(log.post.normal, n=15000,
init = c(k.u=90, k.e=1, log.sd=2),
scale = c(100, 1, 1), acc.rate=0.2,
showProgressBar = FALSE,
data=df, LOQ = LOQ)
## generate 15000 samples
post.normal <- convert.to.coda(pp.normal)
plot(post.normal) # whole chain
## Inspect parameter correlations after discarding the first 5000 samples.
pairs(pp.normal$samples[5000:15000,], col=gray(0, alpha=0.02))
## Sample the gamma-error posterior with the same adaptive MCMC setup.
## NOTE(review): MCMC() and convert.to.coda() appear to come from the
## adaptMCMC package -- confirm. `data` is forwarded to log.post.gamma,
## which falls back to its default LOQ here.
pp.gamma <- MCMC(log.post.gamma, n=15000,
init = c(k.u=90, k.e=1, log.sd=2),
scale = c(100, 1, 1),
showProgressBar = FALSE,
data=df, acc.rate=0.2)
## generate 15000 samples
post.gamma <- convert.to.coda(pp.gamma)
plot(post.gamma) # whole chain
## Inspect parameter correlations after discarding the first 5000 samples.
pairs(pp.gamma$samples[5000:15000,], col=gray(0, alpha=0.02))
Finally, we plot the 90% credible intervals for normal and gamma errors, incorporating both parameter and total uncertainty.
## Posterior predictive simulation (normal errors): draw parameter vectors
## from the post-burn-in MCMC samples and propagate them through the model.
n <- 10000
## Draw all sample indices at once (hoisted out of the loop); indices below
## 5000 are discarded as burn-in.
draw <- sample(5000:15000, n, replace = TRUE)
X.para <- X.total <- matrix(NA_real_, ncol = length(tsim), nrow = n)
for (i in seq_len(n)) {
  para <- pp.normal$samples[draw[i], ]
  X.para[i, ] <- bioacc(para, C_water, tc, tsim)$conc
  sd <- exp(para[3])
  ## Total uncertainty: add the residual (observation) noise.
  X.total[i, ] <- rnorm(length(tsim), mean = X.para[i, ], sd = sd)
}
intervals.para <- compute.intervals(X.para, tsim)
intervals.total <- compute.intervals(X.total, tsim)
## Fix: the time axis is in days, not hours.
ggplot(intervals.para) +
  geom_line(aes(x = time, y = p50), col = "orange", linewidth = 1.5) +
  geom_ribbon(aes(x = time, y = p50, ymin = p05, ymax = p95), alpha = 0.2) +
  geom_ribbon(data = intervals.total, aes(x = time, y = p50, ymin = p05, ymax = p95), alpha = 0.2) +
  xlab("Time (days)") +
  ylab("Internal concentration") +
  geom_vline(xintercept = 1, linetype = "dashed") +
  geom_point(data = df, aes(time, Cinternal)) +
  ggtitle("Bayesian 90% parameter and total uncertainty interval with normal errors")
## Posterior predictive simulation (gamma errors), same scheme as above.
n <- 10000
## Draw all sample indices at once (hoisted out of the loop); indices below
## 5000 are discarded as burn-in.
draw <- sample(5000:15000, n, replace = TRUE)
X.para <- X.total <- matrix(NA_real_, ncol = length(tsim), nrow = n)
for (i in seq_len(n)) {
  para <- pp.gamma$samples[draw[i], ]
  X.para[i, ] <- mode <- bioacc(para, C_water, tc, tsim)$conc
  sd <- exp(para[3])
  ## Gamma with given mode and sd: mode = (shape-1)*scale, var = shape*scale^2.
  scale <- 0.5*(sqrt(4*sd^2 + mode^2) - mode)
  shape <- mode/scale + 1
  X.total[i, ] <- rgamma(length(tsim), scale = scale, shape = shape)
}
intervals.para <- compute.intervals(X.para, tsim)
intervals.total <- compute.intervals(X.total, tsim)
## Fix: the time axis is in days, not hours.
ggplot(intervals.para) +
  geom_line(aes(x = time, y = p50), col = "orange", linewidth = 1.5) +
  geom_ribbon(aes(x = time, y = p50, ymin = p05, ymax = p95), alpha = 0.2) +
  geom_ribbon(data = intervals.total, aes(x = time, y = p50, ymin = p05, ymax = p95), alpha = 0.2) +
  xlab("Time (days)") +
  ylab("Internal concentration") +
  geom_vline(xintercept = 1, linetype = "dashed") +
  geom_point(data = df, aes(time, Cinternal)) +
  ggtitle("Bayesian 90% parameter and total uncertainty interval with gamma errors")
For Perspective 3, we can see how much we have learned from the data by comparing the prior and the posterior distributions. Note that, strictly speaking, the bootstrap distribution cannot be compared to the prior. The prior represents (inter-)subjective prior knowledge, whereas the bootstrap reflects the distribution of the frequentist estimator of the parameter.
## Define the prior density of each parameter separately, for plotting
## against the bootstrap/posterior histograms below.
## Simplifications: log10(x)/log10(exp(1)) is just log(x); spell out
## FALSE rather than the reassignable shorthand F.
k.u <- 112.798
meanlog.k.u <- log(k.u)
k.e <- 0.00296
meanlog.k.e <- log(k.e)
sdlog <- log(10)
d.ku <- function(theta) dlnorm(theta, meanlog.k.u, sdlog, log = FALSE)
d.ke <- function(theta) dlnorm(theta, meanlog.k.e, sdlog, log = FALSE)
par(mfrow = c(2, 2))
## Grids over which to draw the prior density curves.
ku.plt <- seq(0, 300, 1)
ke.plt <- seq(0, 3, 0.01)
## --- bootstrapping
hist(thetas.boot[, "k.u"], xlim = c(0, 300),
     xlab = "k.u", main = "Prior and bootstrap distribution", probability = TRUE, breaks = 25)
## Fix: the curves must use the plotting grids ku.plt/ke.plt -- the
## original referenced undefined variables `ku` and `ke`.
lines(ku.plt, d.ku(ku.plt), type = "l", col = "red", lwd = 2)
hist(thetas.boot[, "k.e"], xlim = c(0, 3),
     xlab = "k.e", main = "Prior and bootstrap distribution", probability = TRUE, breaks = 25)
lines(ke.plt, d.ke(ke.plt), type = "l", col = "red", lwd = 2)
## --- Bayesian
hist(pp.normal$samples[5000:15000, "k.u"], xlim = c(0, 300),
     xlab = "k.u", main = "Prior and Bayesian posterior", probability = TRUE, breaks = 25)
lines(ku.plt, d.ku(ku.plt), type = "l", col = "red", lwd = 2)
hist(pp.normal$samples[5000:15000, "k.e"], xlim = c(0, 3),
     xlab = "k.e", main = "Prior and Bayesian posterior", probability = TRUE, breaks = 25)
lines(ke.plt, d.ke(ke.plt), type = "l", col = "red", lwd = 2)
par(mfrow = c(2, 2))
## Grids over which to draw the prior density curves.
ku.plt <- seq(0, 300, 1)
ke.plt <- seq(0, 3, 0.01)
## --- bootstrapping
hist(thetas.boot.g[, "k.u"], xlim = c(0, 300),
     xlab = "k.u", main = "Prior and bootstrap distribution", probability = TRUE, breaks = 25)
## Fix: the curves must use the plotting grids ku.plt/ke.plt -- the
## original referenced undefined variables `ku` and `ke`.
lines(ku.plt, d.ku(ku.plt), type = "l", col = "red", lwd = 2)
hist(thetas.boot.g[, "k.e"], xlim = c(0, 3),
     xlab = "k.e", main = "Prior and bootstrap distribution", probability = TRUE, breaks = 25)
lines(ke.plt, d.ke(ke.plt), type = "l", col = "red", lwd = 2)
## --- Bayesian
hist(pp.gamma$samples[5000:15000, "k.u"], xlim = c(0, 300),
     xlab = "k.u", main = "Prior and Bayesian posterior", probability = TRUE, breaks = 25)
lines(ku.plt, d.ku(ku.plt), type = "l", col = "red", lwd = 2)
hist(pp.gamma$samples[5000:15000, "k.e"], xlim = c(0, 3),
     xlab = "k.e", main = "Prior and Bayesian posterior", probability = TRUE, breaks = 25)
lines(ke.plt, d.ke(ke.plt), type = "l", col = "red", lwd = 2)